In [1]:
# Importation du package pandas 
import pandas as pd
In [2]:
# Importation de la BD (base de données)
df = pd.read_csv("donnees_marketing_banque.csv", sep = ";")
df
Out[2]:
id age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 id1 58 management married tertiary no 2143.0 yes no unknown 5 may 261 1 -1 0 unknown no
1 id2 44 technician single secondary no 29.0 yes no unknown 5 may 151 1 -1 0 unknown no
2 id3 33 entrepreneur married secondary no 2.0 yes yes unknown 5 may 76 1 -1 0 unknown no
3 id4 47 blue-collar married unknown no 1506.0 yes no unknown 5 may 92 1 -1 0 unknown no
4 id5 33 unknown single unknown no 1.0 no no unknown 5 may 198 1 -1 0 unknown no
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45206 id45207 51 technician married tertiary no 825.0 no no cellular 17 nov 977 3 -1 0 unknown yes
45207 id45208 71 retired divorced primary no 1729.0 no no cellular 17 nov 456 2 -1 0 unknown yes
45208 id45209 72 retired married secondary no 5715.0 no no cellular 17 nov 1127 5 184 3 success yes
45209 id45210 57 blue-collar married secondary no 668.0 no no telephone 17 nov 508 4 -1 0 unknown no
45210 id45211 37 entrepreneur married secondary no 2971.0 no no cellular 17 nov 361 2 188 11 other no

45211 rows × 18 columns

In [3]:
# Les caractéristiques de la BD
df.head()
Out[3]:
id age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 id1 58 management married tertiary no 2143.0 yes no unknown 5 may 261 1 -1 0 unknown no
1 id2 44 technician single secondary no 29.0 yes no unknown 5 may 151 1 -1 0 unknown no
2 id3 33 entrepreneur married secondary no 2.0 yes yes unknown 5 may 76 1 -1 0 unknown no
3 id4 47 blue-collar married unknown no 1506.0 yes no unknown 5 may 92 1 -1 0 unknown no
4 id5 33 unknown single unknown no 1.0 no no unknown 5 may 198 1 -1 0 unknown no
In [4]:
df.tail()
Out[4]:
id age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
45206 id45207 51 technician married tertiary no 825.0 no no cellular 17 nov 977 3 -1 0 unknown yes
45207 id45208 71 retired divorced primary no 1729.0 no no cellular 17 nov 456 2 -1 0 unknown yes
45208 id45209 72 retired married secondary no 5715.0 no no cellular 17 nov 1127 5 184 3 success yes
45209 id45210 57 blue-collar married secondary no 668.0 no no telephone 17 nov 508 4 -1 0 unknown no
45210 id45211 37 entrepreneur married secondary no 2971.0 no no cellular 17 nov 361 2 188 11 other no
In [5]:
df.shape
Out[5]:
(45211, 18)
In [6]:
# La nature des variables de la BD
df.dtypes
Out[6]:
id            object
age            int64
job           object
marital       object
education     object
default       object
balance      float64
housing       object
loan          object
contact       object
day            int64
month         object
duration       int64
campaign       int64
pdays          int64
previous       int64
poutcome      object
y             object
dtype: object
In [7]:
# Description rapide de la BD
df.describe()
Out[7]:
age balance day duration campaign pdays previous
count 45211.000000 45206.000000 45211.000000 45211.000000 45211.000000 45211.000000 45211.000000
mean 40.931477 1362.403707 15.806419 258.163080 2.763841 40.197828 0.580323
std 10.623372 3044.906741 8.322476 257.527812 3.098021 100.128746 2.303441
min 10.000000 -8019.000000 1.000000 0.000000 1.000000 -1.000000 0.000000
25% 33.000000 72.000000 8.000000 103.000000 1.000000 -1.000000 0.000000
50% 39.000000 448.000000 16.000000 180.000000 2.000000 -1.000000 0.000000
75% 48.000000 1428.000000 21.000000 319.000000 3.000000 -1.000000 0.000000
max 95.000000 102127.000000 31.000000 4918.000000 63.000000 871.000000 275.000000
In [8]:
# Ajout des variables qualitatives
df.describe(include = "all")
Out[8]:
id age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
count 45211 45211.000000 45211 45211 45211 45211 45206.000000 45211 45211 45211 45211.000000 45211 45211.000000 45211.000000 45211.000000 45211.000000 45211 45211
unique 45211 NaN 12 3 4 2 NaN 2 2 3 NaN 12 NaN NaN NaN NaN 4 2
top id1 NaN blue-collar married secondary no NaN yes no cellular NaN may NaN NaN NaN NaN unknown no
freq 1 NaN 9732 27214 23202 44396 NaN 25130 37967 29285 NaN 13766 NaN NaN NaN NaN 36959 39922
mean NaN 40.931477 NaN NaN NaN NaN 1362.403707 NaN NaN NaN 15.806419 NaN 258.163080 2.763841 40.197828 0.580323 NaN NaN
std NaN 10.623372 NaN NaN NaN NaN 3044.906741 NaN NaN NaN 8.322476 NaN 257.527812 3.098021 100.128746 2.303441 NaN NaN
min NaN 10.000000 NaN NaN NaN NaN -8019.000000 NaN NaN NaN 1.000000 NaN 0.000000 1.000000 -1.000000 0.000000 NaN NaN
25% NaN 33.000000 NaN NaN NaN NaN 72.000000 NaN NaN NaN 8.000000 NaN 103.000000 1.000000 -1.000000 0.000000 NaN NaN
50% NaN 39.000000 NaN NaN NaN NaN 448.000000 NaN NaN NaN 16.000000 NaN 180.000000 2.000000 -1.000000 0.000000 NaN NaN
75% NaN 48.000000 NaN NaN NaN NaN 1428.000000 NaN NaN NaN 21.000000 NaN 319.000000 3.000000 -1.000000 0.000000 NaN NaN
max NaN 95.000000 NaN NaN NaN NaN 102127.000000 NaN NaN NaN 31.000000 NaN 4918.000000 63.000000 871.000000 275.000000 NaN NaN
In [10]:
# Variables manquantes
df.isnull().mean()
Out[10]:
id           0.000000
age          0.000000
job          0.000000
marital      0.000000
education    0.000000
default      0.000000
balance      0.000111
housing      0.000000
loan         0.000000
contact      0.000000
day          0.000000
month        0.000000
duration     0.000000
campaign     0.000000
pdays        0.000000
previous     0.000000
poutcome     0.000000
y            0.000000
dtype: float64
In [11]:
# conserver uniquement les clients majeurs
df_majeur = df[df["age"] > 18]
df_majeur.describe()
Out[11]:
age balance day duration campaign pdays previous
count 45191.000000 45186.000000 45191.000000 45191.000000 45191.000000 45191.000000 45191.000000
mean 40.942223 1362.894967 15.809387 258.180501 2.764422 40.208072 0.580447
std 10.613323 3045.476834 8.321944 257.574852 3.098537 100.144958 2.303865
min 19.000000 -8019.000000 1.000000 0.000000 1.000000 -1.000000 0.000000
25% 33.000000 72.000000 8.000000 103.000000 1.000000 -1.000000 0.000000
50% 39.000000 449.000000 16.000000 180.000000 2.000000 -1.000000 0.000000
75% 48.000000 1428.000000 21.000000 319.000000 3.000000 -1.000000 0.000000
max 95.000000 102127.000000 31.000000 4918.000000 63.000000 871.000000 275.000000
In [12]:
# Nombredes colonnes
df.columns
Out[12]:
Index(['id', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y'],
      dtype='object')
In [13]:
# Variables quantitatives univariées (histogramme, Diagramme en boîte (boxplot, courbe de densité))
# Matplotlib
# Importation de matplotlib
import matplotlib.pyplot as plt
In [14]:
# Histogramme de la variable age
plt.hist(df_majeur["age"])
Out[14]:
(array([ 2128., 12871., 10992.,  9008.,  6072.,  3310.,   429.,   251.,
          116.,    14.]),
 array([19. , 26.6, 34.2, 41.8, 49.4, 57. , 64.6, 72.2, 79.8, 87.4, 95. ]),
 <BarContainer object of 10 artists>)
In [16]:
# Personnaliser: Couleur, titre des axes et des graphiques,bins,etc 
plt.hist(df_majeur["age"], color = "red", bins = 100)
plt.title("Histogramme de l'âge")
plt.xlabel("Age")
plt.ylabel("Effectif")
Out[16]:
Text(0, 0.5, 'Effectif')
In [18]:
# Ajout d'une grille
plt.hist(df_majeur["age"], color = "red", bins = 100)
plt.title("Histogramme de l'âge")
plt.xlabel("Age")
plt.ylabel("Effectif")
plt.grid()
In [22]:
# Ajouter la taille du graphique
plt.hist(df_majeur["age"], color = "red", bins = 100)
plt.title("Histogramme de l'âge")
plt.xlabel("Age")
plt.ylabel("Effectif")
plt.figure(figsize = (5,5))
plt.show() # afficher le graphique
<Figure size 500x500 with 0 Axes>
In [24]:
# Changer les marques de l'axe des X
plt.hist(df_majeur["age"], color = "red", bins = 100)
plt.xticks(range(0, 100, 5))
Out[24]:
([<matplotlib.axis.XTick at 0x19faaf8c8d0>,
  <matplotlib.axis.XTick at 0x19faaf7e790>,
  <matplotlib.axis.XTick at 0x19fab044b90>,
  <matplotlib.axis.XTick at 0x19faafcca90>,
  <matplotlib.axis.XTick at 0x19fab094cd0>,
  <matplotlib.axis.XTick at 0x19fab096d10>,
  <matplotlib.axis.XTick at 0x19fab09cf10>,
  <matplotlib.axis.XTick at 0x19fab09e790>,
  <matplotlib.axis.XTick at 0x19fab09f390>,
  <matplotlib.axis.XTick at 0x19fab0a5350>,
  <matplotlib.axis.XTick at 0x19fab0a7350>,
  <matplotlib.axis.XTick at 0x19fab0b1510>,
  <matplotlib.axis.XTick at 0x19fab0b3450>,
  <matplotlib.axis.XTick at 0x19faaf97610>,
  <matplotlib.axis.XTick at 0x19fab0b5950>,
  <matplotlib.axis.XTick at 0x19fab0b4a90>,
  <matplotlib.axis.XTick at 0x19fab0b9b10>,
  <matplotlib.axis.XTick at 0x19fab0bbb10>,
  <matplotlib.axis.XTick at 0x19faab0e050>,
  <matplotlib.axis.XTick at 0x19fab0c23d0>],
 [Text(0, 0, '0'),
  Text(5, 0, '5'),
  Text(10, 0, '10'),
  Text(15, 0, '15'),
  Text(20, 0, '20'),
  Text(25, 0, '25'),
  Text(30, 0, '30'),
  Text(35, 0, '35'),
  Text(40, 0, '40'),
  Text(45, 0, '45'),
  Text(50, 0, '50'),
  Text(55, 0, '55'),
  Text(60, 0, '60'),
  Text(65, 0, '65'),
  Text(70, 0, '70'),
  Text(75, 0, '75'),
  Text(80, 0, '80'),
  Text(85, 0, '85'),
  Text(90, 0, '90'),
  Text(95, 0, '95')])
In [25]:
plt.hist(df_majeur["age"], color = "red", bins = 100)
plt.xticks(range(18, 90, 5))
Out[25]:
([<matplotlib.axis.XTick at 0x19fab10f890>,
  <matplotlib.axis.XTick at 0x19fab0d6790>,
  <matplotlib.axis.XTick at 0x19faad6e610>,
  <matplotlib.axis.XTick at 0x19fab1fad10>,
  <matplotlib.axis.XTick at 0x19fab1fc250>,
  <matplotlib.axis.XTick at 0x19fab1fed10>,
  <matplotlib.axis.XTick at 0x19fab11c810>,
  <matplotlib.axis.XTick at 0x19fab205a10>,
  <matplotlib.axis.XTick at 0x19fab205bd0>,
  <matplotlib.axis.XTick at 0x19fab2159d0>,
  <matplotlib.axis.XTick at 0x19fab217950>,
  <matplotlib.axis.XTick at 0x19fab219c90>,
  <matplotlib.axis.XTick at 0x19fab219410>,
  <matplotlib.axis.XTick at 0x19fab2201d0>,
  <matplotlib.axis.XTick at 0x19fab2222d0>],
 [Text(18, 0, '18'),
  Text(23, 0, '23'),
  Text(28, 0, '28'),
  Text(33, 0, '33'),
  Text(38, 0, '38'),
  Text(43, 0, '43'),
  Text(48, 0, '48'),
  Text(53, 0, '53'),
  Text(58, 0, '58'),
  Text(63, 0, '63'),
  Text(68, 0, '68'),
  Text(73, 0, '73'),
  Text(78, 0, '78'),
  Text(83, 0, '83'),
  Text(88, 0, '88')])
In [26]:
# Enregistrement du graphique
plt.savefig("graphique.png")
<Figure size 640x480 with 0 Axes>
In [27]:
# Boxplot de la variable quantitative age
plt.boxplot(df_majeur["age"])
plt.title("Boxplot de la variable âge")
plt.show()
In [28]:
# Graphique de la variable quantitative avec searborn
# importation de seaborn
import seaborn as sns
In [29]:
# Histogramme durée de la dernière communication en séconde
sns.histplot(df["duration"])
plt.title("Histogramme de la variable durée")
plt.figure(figsize = (5,5))
plt.show()
<Figure size 500x500 with 0 Axes>
In [30]:
# Ajout de la courbe de densité (kde = True)
sns.histplot(df["duration"], kde = True, color = "red")
plt.title("Histogramme de la variable durée")
plt.figure(figsize = (5,5))
plt.show()
<Figure size 500x500 with 0 Axes>
In [34]:
# Boxplot avec seaborn
import seaborn as sns
sns.histplot(df_majeur["duration"], color = "green")
plt.title("Boxplot de la variable durée")
plt.show()
In [35]:
sns.boxplot(df_majeur["duration"])
Out[35]:
<Axes: >
In [8]:
sns.seaborn(df_majeur["duration"])
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[8], line 1
----> 1 sns.seaborn(df_majeur["duration"])

AttributeError: module 'seaborn' has no attribute 'seaborn'
In [37]:
# Courbe de densité
sns.kdeplot((df_majeur["duration"]), color = "green")
plt.title("Densité de probabilité")
Out[37]:
Text(0.5, 1.0, 'Densité de probabilité')
In [38]:
# Graphique univarié des variables qualitatives 
# Diagramme en barre
# Diagramme en secteurs(camembert)
# Diagramme en barre en terme de nombre
# Détermination des effectifs par modalité
effectif = df_majeur["education"].value_counts().reset_index()
effectif
Out[38]:
education count
0 secondary 23197
1 tertiary 13298
2 primary 6846
3 unknown 1850
In [39]:
type(effectif)
Out[39]:
pandas.core.frame.DataFrame
In [68]:
couleurs = ["red", "blue", "green", "yellow"]
couleurs
Out[68]:
['red', 'blue', 'green', 'yellow']
In [77]:
# Graphique
import matplotlib.pyplot as plt
plt.bar(effectif["education"], effectif["count"], color = couleurs)
plt.xlabel("Niveau d'éducation")
plt.ylabel("effectif")
plt.title("Répartition par niveau d'éducation")
plt.show()
In [78]:
# Graphique avec proportion
effectif["proportion"] = round((100*effectif["count"]) / effectif["count"].sum())
plt.bar(effectif["education"], effectif["proportion"],color=couleurs)
Out[78]:
<BarContainer object of 4 artists>
In [79]:
effectif["proportion"] = round((100*effectif["count"]) / effectif["count"].sum())
plt.bar(effectif["education"], effectif["proportion"],color=couleurs)
for i, freq in enumerate(effectif["proportion"]):
    plt.text(i, freq, str(freq))
In [80]:
# Réaliser le pieChart
plt.pie(effectif["count"], labels=effectif["education"], autopct="%1.1f%%", colors=couleurs)
plt.show()
In [82]:
#Barplot avec seaborn
sns.barplot(x = "education", y="count", data = effectif)
for i, freq in enumerate(effectif["proportion"]):
    plt.text(i, freq, str(freq))
In [10]:
df = pd.read_csv("donnees_marketing_banque.csv", sep = ";")
df
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 df = pd.read_csv("donnees_marketing_banque.csv", sep = ";")
      2 df

NameError: name 'pd' is not defined
In [9]:
# Analyse des variables qualitatives et quantitatives
# Boxplot entre la variable y et l'âge
# Exemple d'une variable
# relation entre le fait de souscrire au dépôt et la durée du prêt
import seaborn as sns
df_majeur = df[df["age"] > 18]
sns.boxplot(x ="y", y ="age", data = df_majeur)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 6
      1 # Analyse des variables qualitatives et quantitatives
      2 # Boxplot entre la variable y et l'âge
      3 # Exemple d'une variable
      4 # relation entre le fait de souscrire au dépôt et la durée du prêt
      5 import seaborn as sns
----> 6 df_majeur = df[df["age"] > 18]
      7 sns.boxplot(x ="y", y ="age", data = df_majeur)

NameError: name 'df' is not defined
In [13]:
import pandas as pd
In [14]:
df = pd.read_csv("donnees_marketing_banque.csv", sep = ";")
df
Out[14]:
id age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
0 id1 58 management married tertiary no 2143.0 yes no unknown 5 may 261 1 -1 0 unknown no
1 id2 44 technician single secondary no 29.0 yes no unknown 5 may 151 1 -1 0 unknown no
2 id3 33 entrepreneur married secondary no 2.0 yes yes unknown 5 may 76 1 -1 0 unknown no
3 id4 47 blue-collar married unknown no 1506.0 yes no unknown 5 may 92 1 -1 0 unknown no
4 id5 33 unknown single unknown no 1.0 no no unknown 5 may 198 1 -1 0 unknown no
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
45206 id45207 51 technician married tertiary no 825.0 no no cellular 17 nov 977 3 -1 0 unknown yes
45207 id45208 71 retired divorced primary no 1729.0 no no cellular 17 nov 456 2 -1 0 unknown yes
45208 id45209 72 retired married secondary no 5715.0 no no cellular 17 nov 1127 5 184 3 success yes
45209 id45210 57 blue-collar married secondary no 668.0 no no telephone 17 nov 508 4 -1 0 unknown no
45210 id45211 37 entrepreneur married secondary no 2971.0 no no cellular 17 nov 361 2 188 11 other no

45211 rows × 18 columns

In [15]:
import seaborn as sns
In [17]:
df_majeur = df[df["age"] > 18]
In [18]:
sns.boxplot(x="y", y="duration", data = df_majeur)
Out[18]:
<Axes: xlabel='y', ylabel='duration'>
In [19]:
sns.boxplot(x="y", y="age", data = df_majeur)
Out[19]:
<Axes: xlabel='y', ylabel='age'>
In [20]:
# Violinplot
sns.violinplot(x="y", y="age", data = df_majeur)
Out[20]:
<Axes: xlabel='y', ylabel='age'>
In [21]:
# Deux Varaibles qualitatives
# Tableau de contingence entre deux variables qualitatives
# créer un tableau croisé
table_contingence = pd.crosstab(df_majeur["y"], df_majeur["education"])
table_contingence
Out[21]:
education primary secondary tertiary unknown
y
no 6257 20747 11302 1603
yes 589 2450 1996 247
In [23]:
# faire le complot
sns.countplot(x="y", hue = "education", data = df_majeur)
Out[23]:
<Axes: xlabel='y', ylabel='count'>
In [24]:
# Proportion option normalize
prop_table = df_majeur.groupby("y")["education"].value_counts(normalize=True).reset_index(name="proportion")
prop_table
Out[24]:
y education proportion
0 no secondary 0.519858
1 no tertiary 0.283194
2 no primary 0.156782
3 no unknown 0.040166
4 yes secondary 0.463839
5 yes tertiary 0.377887
6 yes primary 0.111511
7 yes unknown 0.046763
In [25]:
# Calculer les proportions pour chaque catégorie de y et éducation 
sns.barplot(x="y", y="proportion", hue = "education", data = prop_table)
Out[25]:
<Axes: xlabel='y', ylabel='proportion'>
In [27]:
# Deux variables quantitatives
# Scatter plot avec matplotlib entre l'âge et la durée
import matplotlib.pyplot as plt
plt.scatter(df_majeur["duration"], df_majeur["age"], c ="blue")
plt.xlabel("Durée")
plt.ylabel("Age")
Out[27]:
Text(0, 0.5, 'Age')
In [28]:
# sns
sns.scatterplot(x = "duration", y = "age", data = df_majeur)
Out[28]:
<Axes: xlabel='duration', ylabel='age'>
In [32]:
# Heatmap de corrélation entre les variables quantitatives 
var_quantitative = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]
df_quanti = df_majeur[var_quantitative]
df_quanti
Out[32]:
age balance day duration campaign pdays previous
0 58 2143.0 5 261 1 -1 0
1 44 29.0 5 151 1 -1 0
2 33 2.0 5 76 1 -1 0
3 47 1506.0 5 92 1 -1 0
4 33 1.0 5 198 1 -1 0
... ... ... ... ... ... ... ...
45206 51 825.0 17 977 3 -1 0
45207 71 1729.0 17 456 2 -1 0
45208 72 5715.0 17 1127 5 184 3
45209 57 668.0 17 508 4 -1 0
45210 37 2971.0 17 361 2 188 11

45191 rows × 7 columns

In [33]:
# Matrice de correlation 
matrice_correlation = df_quanti.corr()
matrice_correlation
Out[33]:
age balance day duration campaign pdays previous
age 1.000000 0.097703 -0.009394 -0.004741 0.004580 -0.023838 0.001271
balance 0.097703 1.000000 0.004320 0.021543 -0.014680 0.003381 0.016645
day -0.009394 0.004320 1.000000 -0.030197 0.162371 -0.093157 -0.051751
duration -0.004741 0.021543 -0.030197 1.000000 -0.084614 -0.001565 0.001188
campaign 0.004580 -0.014680 0.162371 -0.084614 1.000000 -0.088676 -0.032885
pdays -0.023838 0.003381 -0.093157 -0.001565 -0.088676 1.000000 0.454803
previous 0.001271 0.016645 -0.051751 0.001188 -0.032885 0.454803 1.000000
In [34]:
df_majeur.dtypes
Out[34]:
id            object
age            int64
job           object
marital       object
education     object
default       object
balance      float64
housing       object
loan          object
contact       object
day            int64
month         object
duration       int64
campaign       int64
pdays          int64
previous       int64
poutcome      object
y             object
dtype: object
In [35]:
# Graphique
sns.heatmap(matrice_correlation, annot=True, cmap="coolwarm", fmt=".2f")
Out[35]:
<Axes: >
In [ ]:
# Pairplot pour les liaisons rapides
sns.pairplot(df_majeur, hue="y")
In [38]:
# Plotly : Pour les graphiques dynamiques
import pandas as pd
In [39]:
import numpy as np
In [40]:
import plotly.express as px
In [49]:
# Créer un dataframe avec les données de séries temporelles fictives
np.random.seed(42)
date_rng = pd.date_range(start = "2022-01-01", end = "2022-12-31",freq='D')
data = {'Date':date_rng,
       'Valeur':np.random.randn(len(date_rng))}
df = pd.DataFrame(data)
In [50]:
# Créer un graphique de séries temporelles avec plotly express
fig = px.line(df, x='Date', y='Valeur', title = "exemple des séries temporelles")
In [51]:
# Afficher le graphique interactif
fig.show()
In [ ]: